In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
def straight_line(x):
    return 5 * x + 8
In [3]:
straight_line(25)
Out[3]:
133
In [4]:
straight_line(1.254)
Out[4]:
14.27
In [5]:
np.random.seed(5)
samples = 150
x_vals = pd.Series(np.random.rand(samples) * 20)
y_vals = x_vals.map(straight_line)
# Add random noise
y_noisy_vals = y_vals + np.random.randn(samples) * 3
In [6]:
df = pd.DataFrame({'x': x_vals,
                   'y': y_vals,
                   'y_noisy': y_noisy_vals})
In [7]:
df.head()
Out[7]:
In [8]:
# Correlation will indicate how strongly features are related to the output
df.corr()
Out[8]:
In [9]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.x,
            y = df.y,
            label = 'ideal fit')
plt.scatter(x = df.x,
            y = df.y_noisy,
            color = 'r',
            marker = '+',
            label = 'Target')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()
Out[9]:
In [10]:
# Build the path with os.path.join so it works across platforms
# (backslashes in a plain string are fragile escape sequences)
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'straight_line')
In [11]:
df.to_csv(os.path.join(data_path, 'straight_line_example_all.csv'),
          index = True,
          index_label = 'Row')
In [12]:
# 130 rows for Training + Eval Set
df[df.index < 130].to_csv(os.path.join(data_path, 'straight_line_noisy_example_train.csv'),
                          index = True,
                          index_label = 'Row',
                          columns = ['x', 'y_noisy'])
In [13]:
# run all the samples for prediction
df.to_csv(os.path.join(data_path, 'straight_line_example_test_all.csv'),
          index = True,
          index_label = 'Row',
          columns = ['x'])
df_predicted = pd.read_csv(os.path.join(data_path, 'output_straight_line_noisy', 'bp-oDvVSUKSpPe-straight_line_example_test_all.csv'))
In [14]:
df_predicted = pd.read_csv('./output/bp-oDvVSUKSpPe-straight_line_example_test_all.csv.gz')
In [15]:
df_predicted.head()
Out[15]:
In [16]:
df_predicted.columns = ["Row", "y_predicted"]
In [17]:
df_predicted.index = df_predicted.Row
In [18]:
df_predicted.head()
Out[18]:
In [19]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.x,
            y = df.y_noisy,
            color = 'r',
            label = 'actual')
plt.scatter(x = df.x,
            y = df_predicted.y_predicted,
            color = 'b',
            label = 'predicted')
plt.grid(True)
plt.legend()
Out[19]:
In [20]:
# Residuals of the predictions against the noisy actuals
residuals = (df_predicted.y_predicted - df.y_noisy)
fig = plt.figure(figsize = (12, 8))
plt.hist(residuals)
plt.grid(True)
plt.xlabel('(Predicted - Actual)')
plt.ylabel('Count')
plt.title('Residuals Distribution')
plt.axvline(color = 'g')
# left of 0 = prediction < actual
# right of 0 = prediction > actual
Out[20]:
In [21]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy, df_predicted.y_predicted],
            labels = ['actual', 'predicted'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target')
plt.grid(True)
In [22]:
df_predicted_numeric = pd.read_csv('./output-numeric/bp-VNV4qb98Jmd-straight_line_example_test_all.csv.gz')
In [23]:
df_predicted_numeric.columns = ["Row", "y_predicted"]
In [24]:
df_predicted_numeric.head()
Out[24]:
In [25]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.x,
            y = df.y_noisy,
            color = 'r',
            label = 'actual')
plt.scatter(x = df.x,
            y = df_predicted.y_predicted,
            color = 'k',
            label = 'predicted bin')
plt.scatter(x = df.x,
            y = df_predicted_numeric.y_predicted,
            color = 'b',
            label = 'predicted num')
plt.legend()
Out[25]:
In [26]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy, df_predicted.y_predicted, df_predicted_numeric.y_predicted],
            labels = ['actual', 'predicted-bin', 'predicted-numeric'])
plt.title('Box Plot - Actual, Predicted (Binned), Predicted (Numeric)')
plt.ylabel('Target')
plt.grid(True)
RMSE (Root Mean Square Error) is the evaluation metric for linear regression: the smaller the RMSE, the better the model's predictive accuracy, and a perfect model would have an RMSE of 0.
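As a sanity check, the same metric can be computed locally from the frames already in memory. A minimal sketch, assuming the prediction frames stay aligned with df on the Row index (as set up above):

# RMSE = sqrt(mean((predicted - actual)^2)); 0 would be a perfect fit
rmse_bin = np.sqrt(np.mean((df_predicted.y_predicted - df.y_noisy) ** 2))
rmse_num = np.sqrt(np.mean((df_predicted_numeric.y_predicted - df.y_noisy) ** 2))
print('RMSE (binned): {0:.3f}'.format(rmse_bin))
print('RMSE (numeric): {0:.3f}'.format(rmse_num))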
To prepare data for AWS ML, the dataset must be written out as CSV files and uploaded to S3.
Batch Prediction results are stored by AWS ML to S3 in the specified bucket.
We pulled the results from S3 to a local folder and plotted them; a download sketch follows.
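Pulling the batch-prediction output down from S3 can be scripted with boto3. A minimal sketch; the bucket name and key here are placeholders standing in for the job's actual S3 output location:

import boto3

# Hypothetical bucket/key - substitute the S3 output location configured
# for the AWS ML batch prediction
s3 = boto3.client('s3')
s3.download_file('my-aml-bucket',
                 'output/bp-oDvVSUKSpPe-straight_line_example_test_all.csv.gz',
                 './output/bp-oDvVSUKSpPe-straight_line_example_test_all.csv.gz')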
Based on the distribution of the data, AWS ML suggests a recipe for processing it.
For numeric features, it may suggest binning the values instead of treating them as raw numerics.
For this example, treating x as numeric provided the best results; a short illustration of why binning hurts here follows.
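A binned model can output only one level per bin, so its fit is piecewise-constant in x, while the underlying relationship is a straight line. A rough pandas illustration of that effect (this mimics the idea of quantile binning, not AWS ML's actual recipe):

# Quantile-bin x into 10 bins and use each bin's mean target as the
# 'prediction' - a crude stand-in for a model trained on binned x
bins = pd.qcut(df.x, q = 10)
bin_fit = df.y_noisy.groupby(bins).transform('mean')
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.x, y = df.y_noisy, color = 'r', label = 'actual')
plt.scatter(x = df.x, y = bin_fit, color = 'k', label = 'bin means')
plt.legend()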